suppressWarnings(library(tidyverse))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
suppressWarnings(library(leaflet))
suppressWarnings(library(cluster))
suppressWarnings(library(factoextra))
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
suppressWarnings(library(glmnet))
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
suppressWarnings(library(PerformanceAnalytics))
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following object is masked from 'package:leaflet':
## 
##     addLegend
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
# Cargar los datasets
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Imputar valores nulos en total_bedrooms
train$total_bedrooms[is.na(train$total_bedrooms)] <- median(train$total_bedrooms, na.rm = TRUE)
test$total_bedrooms[is.na(test$total_bedrooms)] <- median(test$total_bedrooms, na.rm = TRUE)

No Funciono / No dio resultado satisfactorios

Visualizacion de las variables Espaciales

Cada intento con clusterizacion no redujo el RMSE y por eso de descarto su uso

Latitud, Longitud y Ocean Proximity

ggplot(train, aes(x = longitude, y = latitude, color = ocean_proximity)) +
  geom_point()  +
  labs(title = "Mapa de precios de casas por coordenadas",
       x = "Longitud",
       y = "Latitud",
       color = "Cercania")

Longitud, Latitud y Median House Value

ggplot(train, aes(x = longitude, y = latitude, color = median_house_value)) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Mapa de precios de casas por coordenadas",
       x = "Longitud",
       y = "Latitud",
       color = "Precio de la Casa")

Longitud, Latitud y Median House Value, con mapa

m <- leaflet(train) %>%
  addTiles() %>%
  addCircleMarkers(~longitude, ~latitude, radius = ~median_house_value/100000, 
                   color = ~ifelse(median_house_value > median(train$median_house_value), 'red', 'blue'), 
                   popup = ~paste("median_house_value:", median_house_value))

m

Clusterizacion

train$ocean_proximity_num <- as.numeric(as.factor(train$ocean_proximity))

Visualizacion Para el Metodo del Codo

# Calcular la silueta promedio para k = 2 a 10
k_values <- 2:15

fviz_nbclust(train[, c("longitude", "latitude", "ocean_proximity_num")], kmeans, method = "wss")

Visualizacion Para el Metodo de la Silueta

calcular_silueta_promedio <- function(data, k) {
  set.seed(42)
  kmeans_result <- kmeans(data, centers = k)
  silueta <- silhouette(kmeans_result$cluster, dist(data))
  mean(silueta[, 3])  # Promedio del coeficiente de silueta
}

silueta_promedio <- sapply(k_values, function(k) calcular_silueta_promedio(train[, c("longitude", "latitude", "ocean_proximity_num")], k))


# Visualizar los resultados
plot(k_values, silueta_promedio, type = "b", pch = 19,
     xlab = "Número de Clusters (k)", ylab = "Coeficiente de Silueta Promedio",
     main = "Coeficiente de Silueta Promedio para Diferentes k")

Visualizacion de los clusters y su posicion

num_clusters <- 5


clusters <- kmeans(train[, c("longitude", "latitude", "ocean_proximity_num")], centers = num_clusters)
train$position_cluster <- as.factor(clusters$cluster)

ggplot(train, aes(x = longitude, y = latitude, color = position_cluster)) +
  geom_point() +
  labs(title = "Clustering de casas por coordenadas",
       x = "Longitud",
       y = "Latitud",
       color = "Cluster")

Eliminar Outliers

Se descarto eliminar outliers porque provocaba overfiting

continuas =  names(train)[sapply(train, is.numeric) & names(train) != "median_house_value" & names(train) != "id"  &   30 < sapply(train, n_distinct)  ]
continuas
## [1] "longitude"          "latitude"           "housing_median_age"
## [4] "total_rooms"        "total_bedrooms"     "population"        
## [7] "households"         "median_income"

Pre Eliminacion

for (column_name  in  continuas) {
  
  vector_numerico = unlist(train[column_name])
  
  Q1 <- quantile(vector_numerico, 0.25)  # Primer cuartil (Q1)
  Q3 <- quantile(vector_numerico, 0.75)  # Tercer cuartil (Q3)
  IQR <- Q3 - Q1  # Rango intercuartílico
  
  # Definir los límites inferior y superior para identificar outliers
  limite_inferior <- Q1 - 1.5 * IQR
  limite_superior <- Q3 + 1.5 * IQR
  
  # Identificar los outliers
  outliers <- vector_numerico[vector_numerico < limite_inferior | vector_numerico > limite_superior]
  
  # Contar los outliers
  conteo_outliers <- length(outliers)
  
  # Resultados
  print(column_name)
  print(paste("Número de outliers identificados usando IQR:", conteo_outliers))
  print(paste("Prosnetaje de outliers identificados usando IQR:", conteo_outliers/nrow(train)))
}
## [1] "longitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "latitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "housing_median_age"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "total_rooms"
## [1] "Número de outliers identificados usando IQR: 912"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0631272928635703"
## [1] "total_bedrooms"
## [1] "Número de outliers identificados usando IQR: 914"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0632657299093237"
## [1] "population"
## [1] "Número de outliers identificados usando IQR: 861"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0595971481968575"
## [1] "households"
## [1] "Número de outliers identificados usando IQR: 859"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.059458711151104"
## [1] "median_income"
## [1] "Número de outliers identificados usando IQR: 486"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0336402021180868"
suppressWarnings(chart.Correlation(train[continuas], histogram=TRUE))

Eliminacion

remove_outliers <- function(df, columns) {
  for (col in columns) {
    Q1 <- quantile(df[[col]], 0.25)
    Q3 <- quantile(df[[col]], 0.75)
    IQR <- Q3 - Q1
    df <- df[!(df[[col]] < (Q1 - 1.5 * IQR) | df[[col]] > (Q3 + 1.5 * IQR)), ]
  }
  return(df)
}

train_no_outliers <- remove_outliers(train, continuas)

cat("Tamaño del dataset train_processed original: ", nrow(train), "\n")
## Tamaño del dataset train_processed original:  14447
cat("Tamaño del dataset train_processed sin outliers: ", nrow(train_no_outliers), "\n")
## Tamaño del dataset train_processed sin outliers:  12272

Post eliminacion

for (column_name  in  continuas) {
  
  vector_numerico = unlist(train_no_outliers[column_name])
  
  Q1 <- quantile(vector_numerico, 0.25)  # Primer cuartil (Q1)
  Q3 <- quantile(vector_numerico, 0.75)  # Tercer cuartil (Q3)
  IQR <- Q3 - Q1  # Rango intercuartílico
  
  # Definir los límites inferior y superior para identificar outliers
  limite_inferior <- Q1 - 1.5 * IQR
  limite_superior <- Q3 + 1.5 * IQR
  
  # Identificar los outliers
  outliers <- vector_numerico[vector_numerico < limite_inferior | vector_numerico > limite_superior]
  
  # Contar los outliers
  conteo_outliers <- length(outliers)
  
  # Resultados
  print(column_name)
  print(paste("Número de outliers identificados usando IQR:", conteo_outliers))
  print(paste("Prosnetaje de outliers identificados usando IQR:", conteo_outliers/nrow(train)))
}
## [1] "longitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "latitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "housing_median_age"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "total_rooms"
## [1] "Número de outliers identificados usando IQR: 178"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0123208970720565"
## [1] "total_bedrooms"
## [1] "Número de outliers identificados usando IQR: 53"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.00366858171246626"
## [1] "population"
## [1] "Número de outliers identificados usando IQR: 68"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.00470685955561708"
## [1] "households"
## [1] "Número de outliers identificados usando IQR: 10"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.000692185228767218"
## [1] "median_income"
## [1] "Número de outliers identificados usando IQR: 93"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.00643732262753513"
suppressWarnings(chart.Correlation(train_no_outliers[continuas], histogram=TRUE))

Modelos Descartados

  • Lineal
  • Multilineal
  • Ridgge
  • Lasso
  • Elastic
  • Random Forest

Usados, pero superados

  • Extream Gradiant Boostiong